File to be called for necessary cleaning of data for a particular csv file

In [1]:
# Packages module Code
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score  #Scikit Learn
import os
import random
import argparse
import requests
from tqdm import tqdm  #tqdm is used for progress bar
In [2]:
#Global variable
raw_data=0
In [3]:
def read_file(s):
    return pd.read_csv(s, sep=',')
raw_data=read_file('file1.csv')
raw_data
Out[3]:
ENGINE_ID CONDITION_ID Cycle_Time OpSet1 OpSet2 OpSet3 SensorMeasure1 SensorMeasure2 SensorMeasure3 SensorMeasure4 ... SensorMeasure13 SensorMeasure14 SensorMeasure15 SensorMeasure16 SensorMeasure17 SensorMeasure18 SensorMeasure19 SensorMeasure20 SensorMeasure21 RUL
0 1 1 1 -0.0007 -0.0004 100.0 518.67 641.82 1589.70 1400.60 ... 2388.02 8138.62 8.4195 0.03 392 2388 100.0 39.06 23.4190 191
1 1 1 2 0.0019 -0.0003 100.0 518.67 642.15 1591.82 1403.14 ... 2388.07 8131.49 8.4318 0.03 392 2388 100.0 39.00 23.4236 190
2 1 1 3 -0.0043 0.0003 100.0 518.67 642.35 1587.99 1404.20 ... 2388.03 8133.23 8.4178 0.03 390 2388 100.0 38.95 23.3442 189
3 1 1 4 0.0007 0.0000 100.0 518.67 642.35 1582.79 1401.87 ... 2388.08 8133.83 8.3682 0.03 392 2388 100.0 38.88 23.3739 188
4 1 1 5 -0.0019 -0.0002 100.0 518.67 642.37 1582.85 1406.22 ... 2388.04 8133.80 8.4294 0.03 393 2388 100.0 38.90 23.4044 187
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20626 100 1 196 -0.0004 -0.0003 100.0 518.67 643.49 1597.98 1428.63 ... 2388.26 8137.60 8.4956 0.03 397 2388 100.0 38.49 22.9735 4
20627 100 1 197 -0.0016 -0.0005 100.0 518.67 643.54 1604.50 1433.58 ... 2388.22 8136.50 8.5139 0.03 395 2388 100.0 38.30 23.1594 3
20628 100 1 198 0.0004 0.0000 100.0 518.67 643.42 1602.46 1428.18 ... 2388.24 8141.05 8.5646 0.03 398 2388 100.0 38.44 22.9333 2
20629 100 1 199 -0.0011 0.0003 100.0 518.67 643.23 1605.26 1426.53 ... 2388.23 8139.29 8.5389 0.03 395 2388 100.0 38.29 23.0640 1
20630 100 1 200 -0.0032 -0.0005 100.0 518.67 643.85 1600.38 1432.14 ... 2388.26 8137.33 8.5036 0.03 396 2388 100.0 38.37 23.0522 0

20631 rows × 28 columns

In [4]:
raw_data.info()
# check how many original turbofan engine series we have
len(raw_data['ENGINE_ID'].unique())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ENGINE_ID        20631 non-null  int64  
 1   CONDITION_ID     20631 non-null  int64  
 2   Cycle_Time       20631 non-null  int64  
 3   OpSet1           20631 non-null  float64
 4   OpSet2           20631 non-null  float64
 5   OpSet3           20631 non-null  float64
 6   SensorMeasure1   20631 non-null  float64
 7   SensorMeasure2   20631 non-null  float64
 8   SensorMeasure3   20631 non-null  float64
 9   SensorMeasure4   20631 non-null  float64
 10  SensorMeasure5   20631 non-null  float64
 11  SensorMeasure6   20631 non-null  float64
 12  SensorMeasure7   20631 non-null  float64
 13  SensorMeasure8   20631 non-null  float64
 14  SensorMeasure9   20631 non-null  float64
 15  SensorMeasure10  20631 non-null  float64
 16  SensorMeasure11  20631 non-null  float64
 17  SensorMeasure12  20631 non-null  float64
 18  SensorMeasure13  20631 non-null  float64
 19  SensorMeasure14  20631 non-null  float64
 20  SensorMeasure15  20631 non-null  float64
 21  SensorMeasure16  20631 non-null  float64
 22  SensorMeasure17  20631 non-null  int64  
 23  SensorMeasure18  20631 non-null  int64  
 24  SensorMeasure19  20631 non-null  float64
 25  SensorMeasure20  20631 non-null  float64
 26  SensorMeasure21  20631 non-null  float64
 27  RUL              20631 non-null  int64  
dtypes: float64(22), int64(6)
memory usage: 4.4 MB
Out[4]:
100
In [5]:
#Plot Engine ID and cycle Time
cnt_train=raw_data[['ENGINE_ID', 'Cycle_Time']].groupby('ENGINE_ID').max().sort_values(by='Cycle_Time', ascending=True)
cnt_ind = [str(i) for i in cnt_train.index.to_list()]
cnt_val = list(cnt_train.Cycle_Time.values)

plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(figsize=(20, 80))
rects = ax.barh(cnt_ind, cnt_val)
ax.set_title("Number of Cycles of Different Engines", fontsize=22)
ax.set_ylabel('ENGINE_ID', fontsize=20)
ax.set_xlabel("Number of Cycles", fontsize=20)
ax.grid(True)

for rect in rects:
    width = rect.get_width()
    plt.text(1.05*rect.get_width(), rect.get_y()+0.5*rect.get_height(),
             '%d' % int(width),
             ha='center', va='center')
In [6]:
# Remove sensors with empty sensor values
cols_nan = raw_data.columns[raw_data.isna().any()].tolist()  
#.tolist() is used to convert data elements in array
#isna() Checks for any values and  .any() returns a boolean value
print('Columns without data: \n' + str(cols_nan) + '\n')

train_data = raw_data.drop(columns=cols_nan)
Columns without data: 
[]

In [7]:
# define column sets for the analysis
info_cols = ['ENGINE_ID', 'Cycle_Time', 'RUL','CONDITION_ID']
operational_cols = ['OpSet{}'.format(i + 1) for i in range (3)]
sensor_cols = ['SensorMeasure{}'.format(i + 1) for i in range(21)]

# plot an overview of whats going on in the columns
axes = train_data.describe().T.plot.bar(subplots=True, figsize=(30,15))

#raw_data.describe()
In [8]:
#Columns to be dropped based on STD and uniqueness
cols_const = [ col for col in raw_data.columns if len(raw_data[col].unique()) <= 2 ]  # searching for unique columns
print('Columns with constant values: \n' + str(cols_const) + '\n')
Columns with constant values: 
['CONDITION_ID', 'OpSet3', 'SensorMeasure1', 'SensorMeasure5', 'SensorMeasure6', 'SensorMeasure10', 'SensorMeasure16', 'SensorMeasure18', 'SensorMeasure19']

In [9]:
#Drop columns
raw_data=raw_data.drop(cols_const,axis=1)
raw_data
Out[9]:
ENGINE_ID Cycle_Time OpSet1 OpSet2 SensorMeasure2 SensorMeasure3 SensorMeasure4 SensorMeasure7 SensorMeasure8 SensorMeasure9 SensorMeasure11 SensorMeasure12 SensorMeasure13 SensorMeasure14 SensorMeasure15 SensorMeasure17 SensorMeasure20 SensorMeasure21 RUL
0 1 1 -0.0007 -0.0004 641.82 1589.70 1400.60 554.36 2388.06 9046.19 47.47 521.66 2388.02 8138.62 8.4195 392 39.06 23.4190 191
1 1 2 0.0019 -0.0003 642.15 1591.82 1403.14 553.75 2388.04 9044.07 47.49 522.28 2388.07 8131.49 8.4318 392 39.00 23.4236 190
2 1 3 -0.0043 0.0003 642.35 1587.99 1404.20 554.26 2388.08 9052.94 47.27 522.42 2388.03 8133.23 8.4178 390 38.95 23.3442 189
3 1 4 0.0007 0.0000 642.35 1582.79 1401.87 554.45 2388.11 9049.48 47.13 522.86 2388.08 8133.83 8.3682 392 38.88 23.3739 188
4 1 5 -0.0019 -0.0002 642.37 1582.85 1406.22 554.00 2388.06 9055.15 47.28 522.19 2388.04 8133.80 8.4294 393 38.90 23.4044 187
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20626 100 196 -0.0004 -0.0003 643.49 1597.98 1428.63 551.43 2388.19 9065.52 48.07 519.49 2388.26 8137.60 8.4956 397 38.49 22.9735 4
20627 100 197 -0.0016 -0.0005 643.54 1604.50 1433.58 550.86 2388.23 9065.11 48.04 519.68 2388.22 8136.50 8.5139 395 38.30 23.1594 3
20628 100 198 0.0004 0.0000 643.42 1602.46 1428.18 550.94 2388.24 9065.90 48.09 520.01 2388.24 8141.05 8.5646 398 38.44 22.9333 2
20629 100 199 -0.0011 0.0003 643.23 1605.26 1426.53 550.68 2388.25 9073.72 48.39 519.67 2388.23 8139.29 8.5389 395 38.29 23.0640 1
20630 100 200 -0.0032 -0.0005 643.85 1600.38 1432.14 550.79 2388.26 9061.48 48.20 519.30 2388.26 8137.33 8.5036 396 38.37 23.0522 0

20631 rows × 19 columns

In [10]:
#Updating operational cols and sensor cols as some cols have been dropped
for i in cols_const:
    if i in operational_cols:
         operational_cols.remove(i)
    elif i in sensor_cols:
        sensor_cols.remove(i)
    else:
        info_cols.remove(i)
In [11]:
#Correlation heat Map
corrmat = raw_data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(58,58))
#plt.savefig('Corrplot.png',bbox_inches='tight')
sns.set(font_scale=4,font="Times New Roman")

#plot heat map
g=sns.heatmap(raw_data[top_corr_features].corr(),cmap="RdYlGn", linewidths=0.1, annot=True, annot_kws={"size":35})
g.set_xticklabels(g.get_xmajorticklabels(), fontsize = 35)
g.set_yticklabels(g.get_xmajorticklabels(), fontsize = 35)
#plt.savefig(r'C:\Users\mohan\Desktop\Mtech_project\Plot_images\Corrplot.svg',format='svg')
Out[11]:
[Text(0, 0.5, 'ENGINE_ID'),
 Text(0, 1.5, 'Cycle_Time'),
 Text(0, 2.5, 'OpSet1'),
 Text(0, 3.5, 'OpSet2'),
 Text(0, 4.5, 'SensorMeasure2'),
 Text(0, 5.5, 'SensorMeasure3'),
 Text(0, 6.5, 'SensorMeasure4'),
 Text(0, 7.5, 'SensorMeasure7'),
 Text(0, 8.5, 'SensorMeasure8'),
 Text(0, 9.5, 'SensorMeasure9'),
 Text(0, 10.5, 'SensorMeasure11'),
 Text(0, 11.5, 'SensorMeasure12'),
 Text(0, 12.5, 'SensorMeasure13'),
 Text(0, 13.5, 'SensorMeasure14'),
 Text(0, 14.5, 'SensorMeasure15'),
 Text(0, 15.5, 'SensorMeasure17'),
 Text(0, 16.5, 'SensorMeasure20'),
 Text(0, 17.5, 'SensorMeasure21'),
 Text(0, 18.5, 'RUL')]
In [12]:
#Find columns which have Correlation less than 0.2 with RUL
delete_columns=[]
corr_value=0.2

for col in raw_data.columns:
    corr = raw_data[col].corr(raw_data['RUL'])
    #print(col,corr)
    if abs(corr)>corr_value:
        pass
    else:    
        delete_columns.append(col)
        
delete_columns
Out[12]:
['ENGINE_ID', 'OpSet1', 'OpSet2']
In [13]:
#Drop columns
raw_data=raw_data.drop(delete_columns,axis=1)

#New Correlation Map
corrmat = raw_data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(58,58))
#plt.savefig('Corrplot.png',bbox_inches='tight')
sns.set(font_scale=4,font="Times New Roman")

#plot heat map
g=sns.heatmap(raw_data[top_corr_features].corr(),cmap="RdYlGn", linewidths=0.1, annot=True, annot_kws={"size":35})
g.set_xticklabels(g.get_xmajorticklabels(), fontsize = 35)
g.set_yticklabels(g.get_xmajorticklabels(), fontsize = 35)
#plt.savefig(r'C:\Users\mohan\Desktop\Mtech_project\Plot_images\Corrplot.svg',format='svg')
Out[13]:
[Text(0, 0.5, 'Cycle_Time'),
 Text(0, 1.5, 'SensorMeasure2'),
 Text(0, 2.5, 'SensorMeasure3'),
 Text(0, 3.5, 'SensorMeasure4'),
 Text(0, 4.5, 'SensorMeasure7'),
 Text(0, 5.5, 'SensorMeasure8'),
 Text(0, 6.5, 'SensorMeasure9'),
 Text(0, 7.5, 'SensorMeasure11'),
 Text(0, 8.5, 'SensorMeasure12'),
 Text(0, 9.5, 'SensorMeasure13'),
 Text(0, 10.5, 'SensorMeasure14'),
 Text(0, 11.5, 'SensorMeasure15'),
 Text(0, 12.5, 'SensorMeasure17'),
 Text(0, 13.5, 'SensorMeasure20'),
 Text(0, 14.5, 'SensorMeasure21'),
 Text(0, 15.5, 'RUL')]
In [14]:
#Feature importance selection using xgboost  (Which to use?)  ***** (ask on Monday)
X = raw_data.iloc[:, 0:13]
y = raw_data.iloc[:, 13]

from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
model = DecisionTreeRegressor()
#model = ExtraTreesRegressor()
model.fit(X,y)
print(model.feature_importances_)
[0.03638035 0.03082732 0.03158248 0.05808209 0.03978926 0.02214296
 0.02869711 0.58716652 0.06592178 0.02185765 0.0258966  0.04176318
 0.00989269]
In [15]:
#Plot feature importances
feat_importances = pd.Series(model.feature_importances_,index = X.columns)
feat_importances=feat_importances.sort_values( ascending=False)
feat_importances.nlargest(13).plot(kind='barh',fontsize =13)
Out[15]:
<AxesSubplot:>